Chapter 9 Structured Corpus
There are a lot of pre-collected corpora available for linguistic studies. This chapter will demonstrate how you can load existing corpora in R and perform basic corpus analysis with these data.
9.1 NCCU Spoken Mandarin
9.1.1 Loading the Corpus
9.1.2 Line Segmentation
9.1.3 Metadata vs. Transcript
# Header/metadata lines in CHAT-format transcripts start with "@"
# (e.g. @Participants, @id); keep only those lines.
NCCU_lines_meta <- NCCU_lines %>%
filter(str_detect(line, "^@"))
# Utterance (non-header) lines: everything NOT starting with "@".
# NOTE(review): "^[^@]" requires at least one character, so empty lines
# are dropped as well -- presumably intended; confirm against the data.
NCCU_lines_data <- NCCU_lines %>%
filter(str_detect(line, "^[^@]")) %>%
group_by(doc_id) %>%
mutate(lineID = row_number()) %>%
ungroup %>%
separate(line, into = c("SPID","line"), sep="\t") %>%
mutate(line2 = line %>%
str_replace_all("\\([(\\.)0-9]+?\\)"," <PAUSE> ") %>% # pause marks, e.g. (.) or (0.3) -> <PAUSE>
str_replace_all("\\&\\=[a-z]+"," <EXTRALING> ") %>% # extra-linguistic events, e.g. &=laughs -> <EXTRALING>
str_replace_all("[\u2308\u2309\u230a\u230b]"," ") %>% # overlapping-talk corner brackets
str_replace_all("@[a-z:]+"," ") %>% # code-switching tags, e.g. word@s:eng
str_replace_all("\\s+"," ") %>% # collapse runs of whitespace into one space
str_trim()) # strip leading/trailing spaces
NCCU_lines_data
9.1.4 Word Tokenization
# One row per word: tokenize `line2` on whitespace (the cleanup step
# above already normalized all token delimiters to single spaces).
# NOTE(review): unnest_tokens lowercases tokens unless to_lower = FALSE
# is passed -- confirm this is intended for the English code-switch words.
NCCU_words <- NCCU_lines_data %>%
unnest_tokens(word, line2, token = function(x) str_split(x, "\\s+")) %>%
filter(word!="") # drop empty tokens produced by blank/short lines
NCCU_words
9.1.5 Word frequencies and Wordcloud
# Word frequency table:
#   freq       = total occurrences across all documents
#   dispersion = number of distinct documents the word appears in
NCCU_words_freq <-NCCU_words %>%
count(word, doc_id) %>% # n = occurrences of each word within each doc
group_by(word) %>%
summarize(freq = sum(n), dispersion = n()) %>%
arrange(desc(freq), desc(dispersion))
# Wordcloud of the most frequent words.
# Use library() rather than require(): require() only returns FALSE on a
# missing package, silently letting the next call fail with a confusing error.
library(wordcloud2)
NCCU_words_freq %>%
  # Keep words starting with neither "<" (our <PAUSE>/<EXTRALING> tags)
  # nor an ASCII letter -- i.e. keep the Chinese tokens.
  filter(str_detect(word, "^[^<a-z]")) %>%
  select(word, freq) %>%
  #mutate(freq = log(freq)) %>% # optional: damp the Zipfian skew
  # "diamond" is the documented shape option; "diamonds" is not among
  # wordcloud2's recognized shapes (circle, cardioid, diamond,
  # triangle-forward, triangle, pentagon, star).
  wordcloud2::wordcloud2(minSize = 0.5, size = 1, shape = "diamond")
9.1.6 Concordances
9.1.7 N-grams (Lexical Bundles)
##########################
# Chinese ngrams function #
##########################
# Generate ngram sequences from `text`
# By default, `text` is assumed to have whitespaces as delimiters between tokens
ngram_chi <- function(text, n = 2, delimiter = "_") {
  # Split on RUNS of whitespace ("\\s+", not "\\s") so consecutive or
  # leading spaces do not yield empty tokens; drop any residual empties.
  word_vec <- unlist(strsplit(text, "\\s+"))
  word_vec <- word_vec[word_vec != ""]
  # Too few tokens for even one n-gram: return "" so downstream
  # `filter(ngram != "")` can discard it (original behavior preserved).
  if (length(word_vec) < n) {
    return("")
  }
  # One n-gram per starting position; vapply guarantees a character vector.
  starts <- seq_len(length(word_vec) - n + 1)
  vapply(
    starts,
    function(i) paste(word_vec[i:(i + n - 1)], collapse = delimiter),
    character(1)
  )
}#endfunc
# Wrapper to Vectorize the function over `text`, so a whole column of
# lines can be passed in one call; each element of `text` yields its own
# vector of n-grams (collected into a list / simplified structure).
vngram_chi <- Vectorize(ngram_chi, vectorize.args = "text")
# Turn each utterance into 5-gram tokens joined with "_".
NCCU_ngrams <- NCCU_lines_data %>%
select(-line, -SPID) %>% # keep doc_id / lineID / line2 only
unnest_tokens(ngram, line2, token = function(x) vngram_chi(text = x, n = 5, delimiter = "_")) %>%
filter(ngram != "") # remove empty tokens (due to the short lines)
# 5-gram frequency (freq) and dispersion (number of docs containing the
# ngram), excluding ngrams that contain a special tag such as <PAUSE>.
NCCU_ngrams %>%
count(ngram, doc_id) %>%
group_by(ngram) %>%
summarize(freq = sum(n), dispersion = n()) %>%
arrange(desc(freq), desc(dispersion)) %>%
ungroup %>%
filter(!str_detect(ngram,"<")) -> NCCU_ngrams_freq
NCCU_ngrams_freq
9.2 Connecting SPID to Metadata
# Self-defined function
# Forward-fill empty speaker IDs: each "" inherits the nearest preceding
# non-empty value (continuation lines belong to the previous speaker).
#
# `vec`: character vector of speaker IDs, with "" marking continuations.
# Returns `vec` with every "" replaced by the value before it. A leading
# "" is left unchanged (there is nothing earlier to copy; the original
# code would have failed on a zero-length vec[0] lookup there), and
# empty/length-1 inputs are returned as-is.
fill_spid <- function(vec){
  vec_filled <- vec
  if (length(vec_filled) < 2) {
    return(vec_filled)
  }
  # Start at 2: element 1 has no predecessor to inherit from.
  # (The original `else { i <- i + 1 }` was dead code -- reassigning the
  # loop index inside an R `for` loop has no effect on iteration.)
  for (i in 2:length(vec_filled)) {
    if (vec_filled[i] == "") {
      vec_filled[i] <- vec_filled[i - 1]
    }
  }#endfor
  return(vec_filled)
}#endfunc
# Please check M005.cha
# Inspect documents whose very first line has an empty speaker ID:
# such a line has no earlier SPID to inherit from (a transcript typo),
# so it must be removed before forward-filling below.
NCCU_lines_data %>%
group_by(doc_id) %>%
filter(lineID == 1 & SPID=="")# Remove the typo case
# Clean speaker IDs, forward-fill continuation lines, and build a
# file-unique speaker key (DOC_SPID) for joining with the metadata.
NCCU_lines_data_filled <- NCCU_lines_data %>%
filter(!(doc_id =="M005.cha" & lineID==1)) %>% # drop the typo line found above
group_by(doc_id) %>%
mutate(SPID = str_replace_all(SPID, "[*:]","")) %>% # "*A:" -> "A"
mutate(SPID_FILLED = fill_spid(SPID)) %>% # forward-fill per document
mutate(DOC_SPID = str_c(doc_id, SPID_FILLED, sep="_")) %>% # e.g. "M001.cha_A"
ungroup %>%
select(doc_id, lineID, line2, DOC_SPID)
NCCU_lines_data_filled
Based on the metadata of each file header, we can extract demographic information related to each speaker, including their ID, age, gender, etc.
# Parse "@id" header lines ("|"-delimited CHAT fields) into one row of
# speaker metadata per file/speaker, keyed by DOC_SPID for joining with
# NCCU_lines_data_filled.
NCCU_meta <- NCCU_lines_meta %>%
filter(str_detect(line, "^@(id)")) %>%
separate(line, into=str_c("V",1:11, sep=""), sep = "\\|") %>% # split into fields V1..V11
select(doc_id, V2, V3, V4, V5, V7, V10) %>%
mutate(DOC_SPID = str_c(doc_id, V3, sep="_")) %>% # V3 = speaker code
rename(AGE = V4,
GENDER = V5,
GROUP = V7,
RELATION = V10,
LANG = V2) %>%
select(-V3)
NCCU_meta


